DE and DTU Analyses

Load packages

First we will load in the packages needed to execute this analysis

library(rmdformats)
library(dplyr)
library(readr)
library(tidyr)
library(stringr)
library(magrittr)
library(ggplot2)
library(ggrepel)
library(viridis)
library(edgeR)
library(tximport)
library(DRIMSeq)
library(stageR)
library(ggfortify)
library(pheatmap)
library(cowplot)
library(ggbeeswarm)
library(tibble)
library(tictoc)
library(here)

Some methods will be called “hisat2”, this just means that bootstrapping estimates have not been taken into account. Nothing super special going on.

I have also defined several functions here for quality of life purposes

# source(here("R/get_dtelist.R"))
# source(here("R/get_filter_plot.R"))
source(here("R/de_testing.R"))
source(here("R/de_plotting.R"))
source(here("R/get_dtu_tximport.R"))
source(here("R/get_stager_res.R"))
source(here("R/draw_venn_STAR.R"))
# source(here("R/get_pca_obj_star.R"))
# source(here("R/get_pca_plot_star.R"))
# source(here("R/plot_blandr.R"))
# source(here("R/cor_funcs.R"))
# source(here("R/plot_blandr.R"))
# source(here("R/theme_justin.R"))

# From 01_Annotations
gene_txp_anno <- read_csv(here("data/grch38_103_df.csv.gz"))

txp_gene_ensdb_lengths <- read_csv(
  here("data/txp_gene_ensdb_lengths.csv.gz")
  )

transcript_key <- dplyr::select(txp_gene_ensdb_lengths,
                                c("transcript_id" = tx_id,
                                  "transcript_name" = tx_name))

# From 02_DTElist
hisat2_dtelist <- read_rds(here("data/hisat2_dtelist.rds"))
kallisto_dtelist <- read_rds(here("data/kallisto_dtelist.rds"))
salmon_dtelist <- read_rds(here("data/sa_dtelist.rds"))
star_dtelist <- read_rds(here("data/star_dtelist.rds"))

R Markdown

DTE and DTU Analysis

The question we are asking in this study is, “does the choice and implenetation of an alignment-quantification algorithm inherently bias the RNA-seq counts and downstream statistical tests of differential expression and usage?” From what others have already observed in numerous benchmarking studies is that different methods perform differently, use different algorithms, and are even tailored to answering different biological questions about the data. However, the impacts have been infrequently tested for gene expression, and not even considered for transcript expression. My hypothesis is, “the usage of different alignment-quantification methods will not produce any major difference in gene or transcript expression and downstream statistical analysis.” To tackle this hypothesis, I will conduct differential gene expression (DGE), differential transcript expression (DTE), and differential transcript usage (DTU). I will overlap the results to see how they compare at different perspectives. If there are any observable differences then I will interrogate the differences by comparing the relation of the variable genes with any biological differences to see if there is an overrepresentation of any biological bias. The four methods chosen are Kallisto, Salmon, Selective Alignment-Salmon, and Hisat2-Salmon

The code below details the differential expression and usage in each method ## Hisat2

# hisat2 ----
hisat2_lmfit <- hisat2_dtelist %>% 
  de_lmfit() %T>%
  write_csv(here("data/hisat2_lmfit.csv"))

hisat2_lmfit_sig <- hisat2_lmfit %>%
  dplyr::filter(DE == TRUE)

# Takes 25 minutes on my Macbook
# Takes 6 minutes on my Lenovo
tic("DTU")
message("Starting DTU now")
hisat2_dtu <- read_csv(here("data/dtu_cts_hisat2_salmon.csv.gz")) %>%
  get_dtu_tximport(annotation = gene_txp_anno) %>%
  get_stager_res() %T>%
  write_csv(here("data/hisat2_DTU.csv"))
message("DTU run finishing now")
toc()
## DTU: 391.261 sec elapsed
# DTU: 1319.4 seconds elapsed on Macbook
# DTU: 361.75 seconds elapsed on Lenovo

hisat2_dtu_sig <- hisat2_dtu %>%
  dplyr::filter(DTU == TRUE)

hisat2_dtu_sig_txp <- hisat2_dtu %>%
  dplyr::filter(txpDTU == TRUE)
# Check to see what needs labelling
hisat2_lmfit %>% plot_ma()

label_data <- hisat2_lmfit %>%
  dplyr::filter(FDR < 0.05 & abs(logFC) > 1.5) %>%
  mutate(group = case_when(
    FDR < 0.05 & logFC > 1 ~ "Up Regulated",
    FDR < 0.05 & logFC < -1 ~ "Down Regulated",
    .default = "Not Significant"
  ))


hisat2_ma <- hisat2_lmfit %>%
  plot_ma() +
  geom_label_repel(data = label_data,
                  aes(x = AveExpr, y = logFC, label = transcript_name),
                  size = 3,
                  max.overlaps = Inf,
                  force_pull = 0.5,
                  force = 3) +
  theme(legend.position = "none")

hisat2_volc <- hisat2_lmfit %>%
  plot_volc() +
  geom_label_repel(data = label_data,
                  aes(x = logFC, y = -log10(FDR), label = transcript_name),
                  size = 3,
                  max.overlaps = Inf,
                  force_pull = 0.5,
                  force = 3) +
  theme(legend.position = "none")

hisat2_legend <- cowplot::get_legend(plot_ma(hisat2_lmfit))

hisat2_de <- cowplot::plot_grid(hisat2_volc, hisat2_ma, hisat2_legend,
                                 rel_widths = c(1, 1, 0.3),
                                 nrow = 1)

hisat2_de

hisat2_de %>%
  ggsave(filename = "hisat2_ma_volc_plot.png",
         path = here("figures/"),
         device = "png",
         height = 140,
         width = 330,
         units = "mm",
         dpi = 400)

Salmon

salmon_lmfit <- salmon_dtelist %>% 
  de_lmfit() %T>%
  write_csv(here("data/sa_lmfit.csv"))

salmon_lmfit_sig <- salmon_lmfit %>%
  dplyr::filter(DE == TRUE)

salmon_dtu <- read_csv(
  here("data/dtu_cts_sa.csv.gz")
  ) %>%
  get_dtu_tximport(annotation = gene_txp_anno) %>%
  get_stager_res() %T>%
  write_csv(here("data/salmon_DTU.csv"))

salmon_dtu_sig <- salmon_dtu %>%
  dplyr::filter(DTU == TRUE)

salmon_dtu_sig_txp <- salmon_dtu %>%
  dplyr::filter(txpDTU == TRUE)
# Check to see what needs labelling
salmon_lmfit %>% plot_ma()

label_data <- salmon_lmfit %>%
  dplyr::filter(FDR < 0.05 & abs(logFC) > 6) %>%
  mutate(group = case_when(
    FDR < 0.05 & logFC > 1 ~ "Up Regulated",
    FDR < 0.05 & logFC < -1 ~ "Down Regulated",
    .default = "Not Significant"
  )) %>%
  dplyr::filter(!str_detect(seqnames, "CHR"))


salmon_ma <- salmon_lmfit %>%
  plot_ma() +
  geom_label_repel(data = label_data,
                  aes(x = AveExpr, y = logFC, label = transcript_name),
                  size = 3,
                  max.overlaps = Inf,
                  force_pull = 0.5,
                  force = 3) +
  theme(legend.position = "none")

salmon_volc <- salmon_lmfit %>%
  plot_volc() +
  geom_label_repel(data = label_data,
                  aes(x = logFC, y = -log10(FDR), label = transcript_name),
                  size = 3,
                  max.overlaps = Inf,
                  force_pull = 0.5,
                  force = 3) +
  theme(legend.position = "none")

salmon_legend <- cowplot::get_legend(plot_ma(salmon_lmfit))

salmon_de <- cowplot::plot_grid(salmon_volc, salmon_ma, salmon_legend,
                                 rel_widths = c(1, 1, 0.3),
                                 nrow = 1)

salmon_de

salmon_de %>%
  ggsave(filename = "salmon_ma_volc_plot.png",
         path = here("figures/"),
         device = "png",
         height = 140,
         width = 330,
         units = "mm",
         dpi = 400)

STAR

star_lmfit <- star_dtelist %>% 
  de_lmfit() %T>%
  write_csv(here("data/star_lmfit.csv"))

star_lmfit_sig <- star_lmfit %>%
  dplyr::filter(DE == TRUE)

star_dtu <- read_csv(
  here("data/dtu_cts_star_salmon.csv.gz")
  ) %>%
  get_dtu_tximport(annotation = gene_txp_anno) %>%
  get_stager_res() %T>%
  write_csv(here("data/star_DTU.csv"))

star_dtu_sig <- star_dtu %>%
  dplyr::filter(DTU == TRUE)

star_dtu_sig_txp <- star_dtu %>%
  dplyr::filter(txpDTU == TRUE)
# Check to see what needs labelling
star_lmfit %>% plot_ma()

label_data <- star_lmfit %>%
  dplyr::filter(FDR < 0.05 & abs(logFC) > 6) %>%
  mutate(group = case_when(
    FDR < 0.05 & logFC > 1 ~ "Up Regulated",
    FDR < 0.05 & logFC < -1 ~ "Down Regulated",
    .default = "Not Significant"
  )) %>% 
  dplyr::filter(!str_detect(seqnames, "CHR"))


star_ma <- star_lmfit %>%
  plot_ma() +
  geom_label_repel(data = label_data,
                  aes(x = AveExpr, y = logFC, label = transcript_name),
                  size = 3,
                  max.overlaps = Inf,
                  force_pull = 0.5,
                  force = 3) +
  theme(legend.position = "none")

star_volc <- star_lmfit %>%
  plot_volc() +
  geom_label_repel(data = label_data,
                  aes(x = logFC, y = -log10(FDR), label = transcript_name),
                  size = 3,
                  max.overlaps = Inf,
                  force_pull = 0.5,
                  force = 3) +
  theme(legend.position = "none")

star_legend <- cowplot::get_legend(plot_ma(star_lmfit))

star_de <- cowplot::plot_grid(star_volc, star_ma, star_legend,
                              rel_widths = c(1, 1, 0.3),
                              nrow = 1)

star_de

star_de %>%
  ggsave(filename = "star_ma_volc_plot.png",
         path = here("figures/"),
         device = "png",
         height = 140,
         width = 330,
         units = "mm",
         dpi = 400)

Kallisto

kallisto_lmfit <- kallisto_dtelist %>% 
  de_lmfit() %T>%
  write_csv(here("data/kallisto_lmfit.csv"))

kallisto_lmfit_sig <- kallisto_lmfit %>%
  dplyr::filter(DE == TRUE)

kallisto_dtu <- read_csv(
  here("data/dtu_cts_kallisto.csv.gz")
  ) %>%
  get_dtu_tximport(annotation = gene_txp_anno) %>%
  get_stager_res() %T>%
  write_csv(here("data/kallisto_DTU.csv"))

kallisto_dtu_sig <- kallisto_dtu %>%
  dplyr::filter(DTU == TRUE)

kallisto_dtu_sig_txp <- kallisto_dtu %>%
  dplyr::filter(txpDTU == TRUE)
# Check to see what needs labelling
kallisto_lmfit %>% plot_ma()

label_data <- kallisto_lmfit %>%
  dplyr::filter(FDR < 0.05 & abs(logFC) > 4) %>%
  mutate(group = case_when(
    FDR < 0.05 & logFC > 1 ~ "Up Regulated",
    FDR < 0.05 & logFC < -1 ~ "Down Regulated",
    .default = "Not Significant"
  ))


kallisto_ma <- kallisto_lmfit %>%
  plot_ma() +
  geom_label_repel(data = label_data,
                  aes(x = AveExpr, y = logFC, label = transcript_name),
                  size = 3,
                  max.overlaps = Inf,
                  force_pull = 0.5,
                  force = 3) +
  theme(legend.position = "none")

kallisto_volc <- kallisto_lmfit %>%
  plot_volc() +
  geom_label_repel(data = label_data,
                  aes(x = logFC, y = -log10(FDR), label = transcript_name),
                  size = 3,
                  max.overlaps = Inf,
                  force_pull = 0.5,
                  force = 3) +
  theme(legend.position = "none")

kallisto_legend <- cowplot::get_legend(plot_ma(kallisto_lmfit))

kallisto_de <- cowplot::plot_grid(kallisto_volc, kallisto_ma, kallisto_legend,
                                  rel_widths = c(1, 1, 0.3),
                                  nrow = 1)

kallisto_de

kallisto_de %>%
  ggsave(filename = "kallisto_ma_volc_plot.png",
         path = here("figures/"),
         device = "png",
         height = 140,
         width = 330,
         units = "mm",
         dpi = 400)

Venn Diagram (Figure 4)

venn_dte <- draw_venn(hisat2_res =  hisat2_lmfit_sig,
                      kallisto_res = kallisto_lmfit_sig,
                      star_res = star_lmfit_sig,
                      salmon_res = salmon_lmfit_sig) %>%
  as_grob()

ggsave(venn_dte,
       filename = "figure3_venn_dte.png",
       path = here("figures/"),
       device = "png",
       height = 10,
       width = 13,
       units = "cm")

hisat2_dtu_sig_gene <- hisat2_dtu %>% 
  dplyr::filter(DTU == TRUE) %>%
  dplyr::select(Gene, geneFDR, DTU) %>%
  dplyr::rename("Transcript" = Gene) %>%
  distinct()

kallisto_dtu_sig_gene <- kallisto_dtu %>% 
  dplyr::filter(DTU == TRUE) %>%
  dplyr::select(Gene, geneFDR, DTU) %>%
  dplyr::rename("Transcript" = Gene) %>%
  distinct()

star_dtu_sig_gene <- star_dtu %>%
  dplyr::filter(DTU == TRUE) %>%
  dplyr::select(Gene, geneFDR, DTU) %>% 
  dplyr::rename("Transcript" = Gene) %>%
  distinct()

salmon_dtu_sig_gene <- salmon_dtu %>%
  dplyr::filter(DTU == TRUE) %>%
  dplyr::select(Gene, geneFDR, DTU) %>%
  dplyr::rename("Transcript" = Gene) %>%
  distinct()

venn_dtu <- draw_venn(hisat2_res =  hisat2_dtu_sig_gene,
                      kallisto_res = kallisto_dtu_sig_gene,
                      star_res = star_dtu_sig_gene,
                      salmon_res = salmon_dtu_sig_gene) %>%
  as_grob()

venn_dtu
## gTree[GRID.gTree.938]
ggsave(venn_dtu,
       filename = "figure3_venn_dtu.png",
       path = here("figures/"),
       device = "png",
       height = 10,
       width = 13,
       units = "cm")

sig_dtu_transcripts

hisat2_dtu_sig_transcripts <- hisat2_dtu %>% 
  dplyr::filter(txpDTU == TRUE) %>%
  dplyr::select(Transcript, txpFDR, txpDTU) %>%
  distinct()

kallisto_dtu_sig_transcripts <- kallisto_dtu %>%
  dplyr::filter(txpDTU == TRUE) %>%
  dplyr::select(Transcript, txpFDR, txpDTU) %>%
  distinct()

star_dtu_sig_transcripts <- star_dtu %>%
  dplyr::filter(txpDTU == TRUE) %>%
  dplyr::select(Transcript, txpFDR, txpDTU) %>% 
  distinct()

salmon_dtu_sig_transcripts <- salmon_dtu %>%
  dplyr::filter(txpDTU == TRUE) %>%
  dplyr::select(Transcript, txpFDR, txpDTU) %>%
  distinct()

venn_dtu_txp <- draw_venn(hisat2_res =  hisat2_dtu_sig_transcripts,
                          kallisto_res = kallisto_dtu_sig_transcripts,
                          star_res = star_dtu_sig_transcripts,
                          salmon_res = salmon_dtu_sig_transcripts) %>%
  as_grob()

venn_dtu_txp
## gTree[GRID.gTree.966]
ggsave(venn_dtu_txp,
       filename = "figure4_venn_txpdtu.png",
       path = here("figures/"),
       device = "png",
       height = 10,
       width = 13,
       units = "cm")

Cowplot

venn_cowplot <- cowplot::plot_grid(
  venn_dte,
  venn_dtu_txp,
  venn_dtu,
  ncol = 1,
  labels = c("                              a. DTE Results", 
             "            b. DTU Transcript-level Results",
             "                  c. DTU Gene-level Results")
  )

venn_cowplot

ggsave(venn_cowplot,
       filename = "figure4_venn_cowplot.png",
       path = here("figures/"),
       height = 30,
       width = 20,
       units = "cm",
       dpi = 400)

Venn diagram for all

We make DE == TRUE for all rows here so that all genes are included in the venn diagram regardless of significance, just filtering

hisat2_venn_all <- hisat2_lmfit %>%
  mutate(DE = TRUE)

salmon_venn_all <- salmon_lmfit %>%
  mutate(DE = TRUE)

kallisto_venn_all <- kallisto_lmfit %>%
  mutate(DE = TRUE)

star_venn_all <- star_lmfit %>%
  mutate(DE = TRUE)

venn_dte_nofilter <- draw_venn(hisat2_res =  hisat2_venn_all,
                               kallisto_res = kallisto_venn_all,
                               star_res = star_venn_all,
                               salmon_res = salmon_venn_all) %>%
  as_grob()


ggsave(venn_dte_nofilter,
       filename = "figure_supp_venn_all_dte_filter_pass.png",
       path = here("figures/"),
       device = "png",
       height = 10,
       width = 13,
       units = "cm")


hisat2_dtu_sig_gene <- hisat2_dtu %>% 
  dplyr::select(Gene, geneFDR, DTU) %>%
  dplyr::rename("Transcript" = Gene) %>%
  distinct()

kallisto_dtu_sig_gene <- kallisto_dtu %>% 
  dplyr::select(Gene, geneFDR, DTU) %>%
  dplyr::rename("Transcript" = Gene) %>%
  distinct()

star_dtu_sig_gene <- star_dtu %>%
  dplyr::select(Gene, geneFDR, DTU) %>% 
  dplyr::rename("Transcript" = Gene) %>%
  distinct()

salmon_dtu_sig_gene <- salmon_dtu %>%
  dplyr::select(Gene, geneFDR, DTU) %>%
  dplyr::rename("Transcript" = Gene) %>%
  distinct()

venn_dtu_nofilter <- draw_venn(hisat2_res =  hisat2_dtu_sig_gene,
                               kallisto_res = kallisto_dtu_sig_gene,
                               star_res = star_dtu_sig_gene,
                               salmon_res = salmon_dtu_sig_gene) %>%
  as_grob()

ggsave(venn_dtu_nofilter,
       filename = "supp_venn_dtu_all_filter_pass.png",
       path = here("figures/"),
       device = "png",
       height = 10,
       width = 13,
       units = "cm")

Compare differences

hisat2_distinct <- hisat2_lmfit %>%
  dplyr::select(transcript_id,
                transcript_name,
                AveExpr, 
                logFC,
                P.Value,
                FDR,
                DE) %>%
  dplyr::mutate(distinct = case_when(
    !transcript_id %in% unique(
      rbind(salmon_lmfit,
            star_lmfit,
            kallisto_lmfit)[["transcript_id"]]
    ) ~ "Distinct",
    .default = "Not Distinct"))

hisat2_exclusive <- hisat2_distinct %>%
  dplyr::filter(distinct == "Distinct")

hisat2_distinct_plot <- hisat2_distinct %>%
  ggplot(aes(x = AveExpr, y = logFC, colour = distinct, shape = DE)) +
  geom_point() +
  geom_point(data = hisat2_exclusive) +
  scale_colour_manual(values = c("navy", "lightblue")) +
  labs(x = "Mean Transcript Expression (log2 CPM)",
       y = "log2 Fold Change",
       colour = "Exclusivity",
       shape = "Significant?") +
  theme_bw() +
  theme(axis.title = element_text(colour = "black", size = 14),
        axis.text = element_text(colour = "black", size = 12),
        legend.title = element_text(colour = "black", size = 14,
                                    face = "bold"),
        legend.text = element_text(colour = "black", size = 12))

hisat2_distinct_plot

# Save these plots
ggsave(plot = hisat2_distinct_plot,
       filename = "hisat2_distinct_plot.png",
       path = here("figures/"),
       height = 100,
       width = 200,
       units = "mm",
       device = "png",
       dpi = 400)
kallisto_distinct <- kallisto_lmfit %>%
  dplyr::select(transcript_id,
                transcript_name,
                AveExpr,
                logFC,
                P.Value, 
                FDR,
                DE) %>%
  dplyr::mutate(distinct = case_when(
    !transcript_id %in% unique(
      rbind(salmon_lmfit,
            star_lmfit,
            hisat2_lmfit)[["transcript_id"]]
    ) ~ "Distinct",
    .default = "Not Distinct"))

kallisto_exclusive <- kallisto_distinct %>%
  dplyr::filter(distinct == "Distinct")

kallisto_distinct_plot <- kallisto_distinct %>%
  ggplot(aes(x = AveExpr, y = logFC, colour = distinct, shape = DE)) +
  geom_point() +
  geom_point(data = kallisto_exclusive) +
  scale_colour_manual(values = c("navy", "lightblue")) +
  labs(x = "Mean Transcript Expression (log2 CPM)",
       y = "log2 Fold Change",
       colour = "Exclusivity",
       shape = "Significant?") +
  theme_bw() +
  theme(axis.title = element_text(colour = "black", size = 14),
        axis.text = element_text(colour = "black", size = 12),
        legend.title = element_text(colour = "black", size = 14,
                                    face = "bold"),
        legend.text = element_text(colour = "black", size = 12))

kallisto_distinct_plot

# Save these plots
ggsave(plot = kallisto_distinct_plot,
       filename = "kallisto_distinct_plot.png",
       path = here("figures/"),
       height = 100,
       width = 200,
       units = "mm",
       device = "png",
       dpi = 400)
star_distinct <- star_lmfit %>%
  dplyr::select(transcript_id,
                transcript_name,
                AveExpr,
                logFC,
                P.Value, 
                FDR,
                DE) %>%
  dplyr::mutate(distinct = case_when(
    !transcript_id %in% unique(
      rbind(salmon_lmfit,
            kallisto_lmfit,
            hisat2_lmfit)[["transcript_id"]]
    ) ~ "Distinct",
    .default = "Not Distinct"))

star_exclusive <- star_distinct %>%
  dplyr::filter(distinct == "Distinct")

star_distinct_plot <- star_distinct %>%
  ggplot(aes(x = AveExpr, y = logFC, colour = distinct, shape = DE)) +
  geom_point() +
  geom_point(data = star_exclusive) +
  scale_colour_manual(values = c("navy", "lightblue")) +
  labs(x = "Mean Transcript Expression (log2 CPM)",
       y = "log2 Fold Change",
       colour = "Exclusivity",
       shape = "Significant?") +
  theme_bw() +
  theme(axis.title = element_text(colour = "black", size = 14),
        axis.text = element_text(colour = "black", size = 12),
        legend.title = element_text(colour = "black", size = 14,
                                    face = "bold"),
        legend.text = element_text(colour = "black", size = 12))

star_distinct_plot

# Save these plots
ggsave(plot = star_distinct_plot,
       filename = "star_distinct_plot.png",
       path = here("figures/"),
       height = 100,
       width = 200,
       units = "mm",
       device = "png",
       dpi = 400)
salmon_distinct <- salmon_lmfit %>%
  dplyr::select(transcript_id,
                transcript_name,
                AveExpr,
                logFC,
                P.Value, 
                FDR,
                DE) %>%
  dplyr::mutate(distinct = case_when(
    !transcript_id %in% unique(
      rbind(star_lmfit,
            kallisto_lmfit,
            hisat2_lmfit)[["transcript_id"]]
    ) ~ "Distinct",
    .default = "Not Distinct"))

salmon_exclusive <- salmon_distinct %>%
  dplyr::filter(distinct == "Distinct")

salmon_distinct_plot <- salmon_distinct %>%
  ggplot(aes(x = AveExpr, y = logFC, colour = distinct, shape = DE)) +
  geom_point() +
  geom_point(data = salmon_exclusive) +
  scale_colour_manual(values = c("navy", "lightblue")) +
  labs(x = "Mean Transcript Expression (log2 CPM)",
       y = "log2 Fold Change",
       colour = "Exclusivity",
       shape = "Significant?") +
  theme_bw() +
  theme(axis.title = element_text(colour = "black", size = 14),
        axis.text = element_text(colour = "black", size = 12),
        legend.title = element_text(colour = "black", size = 14,
                                    face = "bold"),
        legend.text = element_text(colour = "black", size = 12))

salmon_distinct_plot

# Save these plots
ggsave(plot = salmon_distinct_plot,
       filename = "salmon_distinct_plot.png",
       path = here("figures/"),
       height = 100,
       width = 200,
       units = "mm",
       device = "png",
       dpi = 400)
tx_in_all_de <- hisat2_lmfit_sig %>%
  inner_join(salmon_lmfit,
             by = "transcript_id") %>%
  inner_join(star_lmfit,
             by = "transcript_id") %>%
  inner_join(kallisto_lmfit,
             by = "transcript_id")

# Get the hisat2 transcripts
hisat2_dte_exclusive <- hisat2_lmfit_sig %>%
  dplyr::filter(DE == TRUE) %>%
  dplyr::filter(!transcript_id %in% salmon_lmfit_sig$transcript_id) %>%
  dplyr::filter(!transcript_id %in% star_lmfit_sig$transcript_id) %>%
  dplyr::filter(!transcript_id %in% kallisto_lmfit_sig$transcript_id)

hisat2_dte_exclusive %>%
  write_csv(here("data/hisat2_exclusive_tx.csv"))

# Get salmon transcripts
salmon_dte_exclusive <- salmon_lmfit_sig %>%
  dplyr::filter(DE == TRUE) %>%
  dplyr::filter(!transcript_id %in% hisat2_lmfit_sig$transcript_id) %>%
  dplyr::filter(!transcript_id %in% star_lmfit_sig$transcript_id) %>%
  dplyr::filter(!transcript_id %in% kallisto_lmfit_sig$transcript_id)

# Group to be used for kmer analysis
salmon_dte_exclusive %>%
  head(28)
## # A tibble: 28 × 24
##    gene_id    gene_length transcript_length seqnames  start    end  width strand
##    <chr>            <dbl>             <dbl> <chr>     <dbl>  <dbl>  <dbl> <chr> 
##  1 ENSG00000…        5944              4914 10       3.18e7 3.19e7 123378 -     
##  2 ENSG00000…       15174             11501 11       1.97e7 2.01e7 408765 +     
##  3 ENSG00000…        4476              4303 9        1.21e8 1.21e8  26781 -     
##  4 ENSG00000…        6448              2427 1        1.51e8 1.51e8  64649 -     
##  5 ENSG00000…       16474              9588 4        1.05e8 1.05e8 133018 +     
##  6 ENSG00000…       14716              3092 12       6.45e7 6.45e7  49957 +     
##  7 ENSG00000…       11558              4745 3        1.91e8 1.91e8 137430 +     
##  8 ENSG00000…        4912              2489 19       7.28e5 7.48e5  20544 +     
##  9 ENSG00000…        4341              3202 CHR_HSC… 3.22e7 3.22e7   5443 -     
## 10 ENSG00000…        4849              1884 11       1.23e8 1.23e8   2889 -     
## # ℹ 18 more rows
## # ℹ 16 more variables: tx_biotype <chr>, tx_cds_seq_start <dbl>,
## #   tx_cds_seq_end <dbl>, tx_support_level <lgl>, tx_id_version <chr>,
## #   gc_content <dbl>, tx_name <chr>, transcript_id <chr>,
## #   transcript_name <chr>, logFC <dbl>, AveExpr <dbl>, t <dbl>, P.Value <dbl>,
## #   FDR <dbl>, B <dbl>, DE <lgl>
salmon_dte_exclusive %>%
  write_csv(here("data/salmon_exclusive_tx.csv"))

# Get SA transcripts
star_dte_exclusive <- star_lmfit_sig %>%
  dplyr::filter(DE == TRUE) %>%
  dplyr::filter(!transcript_id %in% hisat2_lmfit_sig$transcript_id) %>%
  dplyr::filter(!transcript_id %in% salmon_lmfit_sig$transcript_id) %>%
  dplyr::filter(!transcript_id %in% kallisto_lmfit_sig$transcript_id)

# Group to be used for kmer analysis
star_dte_exclusive %>%
  head(28)
## # A tibble: 28 × 24
##    gene_id    gene_length transcript_length seqnames  start    end  width strand
##    <chr>            <dbl>             <dbl> <chr>     <dbl>  <dbl>  <dbl> <chr> 
##  1 ENSG00000…        7926               449 12       5.35e7 5.35e7   1317 +     
##  2 ENSG00000…        6710              1847 12       1.04e8 1.04e8  38633 +     
##  3 ENSG00000…        7173              5175 12       1.21e8 1.21e8  60361 -     
##  4 ENSG00000…        7392              5884 22       2.96e7 2.97e7  94978 +     
##  5 ENSG00000…       14561              3780 10       3.29e7 3.30e7  57842 -     
##  6 ENSG00000…        6821              1209 17       6.85e7 6.86e7  35920 +     
##  7 ENSG00000…        7518              1009 11       1.02e8 1.02e8  19578 +     
##  8 ENSG00000…        3785              1621 3        5.27e7 5.27e7   9128 -     
##  9 ENSG00000…        2786              1691 22       3.71e7 3.71e7  11652 +     
## 10 ENSG00000…       13755              4674 4        3.32e6 3.44e6 123844 +     
## # ℹ 18 more rows
## # ℹ 16 more variables: tx_biotype <chr>, tx_cds_seq_start <dbl>,
## #   tx_cds_seq_end <dbl>, tx_support_level <lgl>, tx_id_version <chr>,
## #   gc_content <dbl>, tx_name <chr>, transcript_id <chr>,
## #   transcript_name <chr>, logFC <dbl>, AveExpr <dbl>, t <dbl>, P.Value <dbl>,
## #   FDR <dbl>, B <dbl>, DE <lgl>
star_dte_exclusive %>%
  write_csv(here("data/star_exclusive_tx.csv"))

# Get Kallisto transcripts
kallisto_dte_exclusive <- kallisto_lmfit_sig %>%
  dplyr::filter(DE == TRUE) %>%
  dplyr::filter(!transcript_id %in% salmon_lmfit_sig$transcript_id) %>%
  dplyr::filter(!transcript_id %in% star_lmfit_sig$transcript_id) %>%
  dplyr::filter(!transcript_id %in% hisat2_lmfit_sig$transcript_id)

# Group to be used for kmer analysis
kallisto_dte_exclusive %>%
  head(28)
## # A tibble: 28 × 24
##    gene_id    gene_length transcript_length seqnames  start    end  width strand
##    <chr>            <dbl>             <dbl> <chr>     <dbl>  <dbl>  <dbl> <chr> 
##  1 ENSG00000…        6366              1336 7        1.05e8 1.05e8  24949 -     
##  2 ENSG00000…       10605              8133 17       1.57e6 1.63e6  59567 -     
##  3 ENSG00000…        7645              6007 6        1.11e8 1.11e8 205232 -     
##  4 ENSG00000…        6644              4697 2        5.50e7 5.51e7  78263 -     
##  5 ENSG00000…        1401               561 16       1.64e7 1.64e7   2975 +     
##  6 ENSG00000…        7650              7230 CHR_HSC… 2.99e7 3.00e7 122345 -     
##  7 ENSG00000…        5294              2704 1        1.59e8 1.59e8  45234 +     
##  8 ENSG00000…        9981              1530 15       6.71e7 6.72e7  52894 +     
##  9 ENSG00000…       29523              2822 20       4.73e5 5.44e5  71225 -     
## 10 ENSG00000…        5349              3293 22       5.03e7 5.04e7 101044 +     
## # ℹ 18 more rows
## # ℹ 16 more variables: tx_biotype <chr>, tx_cds_seq_start <dbl>,
## #   tx_cds_seq_end <dbl>, tx_support_level <lgl>, tx_id_version <chr>,
## #   gc_content <dbl>, tx_name <chr>, transcript_id <chr>,
## #   transcript_name <chr>, logFC <dbl>, AveExpr <dbl>, t <dbl>, P.Value <dbl>,
## #   FDR <dbl>, B <dbl>, DE <lgl>
kallisto_dte_exclusive %>%
  write_csv(here("data/kallisto_exclusive_tx.csv"))

# Get transcripts found in all methods
all_methods_dte <- hisat2_lmfit_sig %>%
  dplyr::filter(transcript_id %in% salmon_lmfit_sig$transcript_id) %>%
  dplyr::filter(transcript_id %in% star_lmfit_sig$transcript_id) %>%
  dplyr::filter(transcript_id %in% kallisto_lmfit_sig$transcript_id) %>%
  dplyr::filter(DE == TRUE)

all_methods_dte %>%
  write_csv(here("data/all_methods_tx.csv"))
tx_in_all_dtu <- hisat2_dtu_sig %>%
  inner_join(salmon_dtu_sig,
             by = "Transcript") %>%
  inner_join(star_dtu_sig,
             by = "Transcript") %>%
  inner_join(kallisto_dtu_sig,
             by = "Transcript")

hisat2_dtu_exclusive <- hisat2_dtu_sig %>%
  dplyr::filter(!Transcript %in% salmon_dtu_sig$Transcript) %>%
  dplyr::filter(!Transcript %in% star_dtu_sig$Transcript) %>%
  dplyr::filter(!Transcript %in% kallisto_dtu_sig$Transcript)

hisat2_dtu_exclusive %>%
  write_csv(here("data/hisat2_exclusive_dtu.csv"))

salmon_dtu_exclusive <- salmon_dtu_sig %>%
  dplyr::filter(!Transcript %in% hisat2_dtu_sig$Transcript) %>%
  dplyr::filter(!Transcript %in% star_dtu_sig$Transcript) %>%
  dplyr::filter(!Transcript %in% kallisto_dtu_sig$Transcript)

salmon_dtu_exclusive %>%
  write_csv(here("data/salmon_exclusive_dtu.csv"))

star_dtu_exclusive <- star_dtu_sig %>%
  dplyr::filter(!Transcript %in% hisat2_dtu_sig$Transcript) %>%
  dplyr::filter(!Transcript %in% salmon_dtu_sig$Transcript) %>%
  dplyr::filter(!Transcript %in% kallisto_dtu_sig$Transcript)

star_dtu_exclusive %>%
  write_csv(here("data/star_exclusive_dtu.csv"))

kallisto_dtu_exclusive <- kallisto_dtu_sig %>%
  dplyr::filter(!Transcript %in% salmon_dtu_sig$Transcript) %>%
  dplyr::filter(!Transcript %in% star_dtu_sig$Transcript) %>%
  dplyr::filter(!Transcript %in% hisat2_dtu_sig$Transcript)

kallisto_dtu_exclusive %>%
  write_csv(here("data/kallisto_exclusive_dtu.csv"))

all_methods_dtu <- hisat2_dtu_sig %>%
  dplyr::filter(Transcript %in% salmon_dtu_sig$Transcript) %>%
  dplyr::filter(Transcript %in% star_dtu_sig$Transcript) %>%
  dplyr::filter(Transcript %in% kallisto_dtu_sig$Transcript)

all_methods_dtu %>%
  write_csv(here("data/all_methods_tx.csv"))

Investigate PCBP2

pcbp2_comp_df <- full_join(
  hisat2_lmfit %>%
    dplyr::select(transcript_name,
                  "logFC_hisat2" = logFC,
                  "FDR_hisat2" = FDR) %>%
    dplyr::filter(str_detect(transcript_name, "PCBP2")),
  kallisto_lmfit %>% 
    dplyr::select(transcript_name,
                  "logFC_kallisto" = logFC,
                  "FDR_kallisto" = FDR) %>%
    dplyr::filter(str_detect(transcript_name, "PCBP2")),
  by = "transcript_name") %>%
  full_join(salmon_lmfit %>%
              dplyr::select(transcript_name,
                            "logFC_salmon" = logFC,
                            "FDR_salmon" = FDR) %>%
              dplyr::filter(str_detect(transcript_name, "PCBP2")),
            by = "transcript_name") %>% 
  full_join(star_lmfit %>%
              dplyr::select(transcript_name,
                            "logFC_STAR" = logFC,
                            "FDR_STAR" = FDR) %>%
              dplyr::filter(str_detect(transcript_name, "PCBP2")),
            by = "transcript_name")

logfc_pcbp2 <- pcbp2_comp_df %>%
  pivot_longer(cols = starts_with("logFC"),
               values_to = "logFC",
               names_to = "group") %>%
  ggplot(aes(x = group, y = logFC, colour = group)) +
  geom_point(size = 3) +
  scale_colour_manual(values = c("#61c3d7", "#4fc14d",
                                 "#bf4dc1", "#f6866f")) +
  facet_wrap(~ transcript_name,
             nrow = 5, ncol = 5) +
  theme_bw() +
  theme(axis.text.x = element_text(size = 12, colour = "black", angle = 90))

ggsave(plot = logfc_pcbp2,
       filename = "logfc_pcbp2.png",
       path = here("figures/"),
       height = 300,
       width = 320,
       units = "mm",
       device = "png",
       dpi = 400)

fdr_pcbp2 <- pcbp2_comp_df %>%
  pivot_longer(cols = starts_with("FDR"),
               values_to = "FDR",
               names_to = "group") %>%
  ggplot(aes(x = group, y = -log2(FDR), colour = group)) +
  geom_point(size = 3) +
  scale_colour_manual(values = c("#61c3d7", "#4fc14d",
                                 "#bf4dc1", "#f6866f")) +
  facet_wrap(~ transcript_name,
             nrow = 5, ncol = 5) +
  theme_bw() +
  theme(axis.text.x = element_text(size = 12, colour = "black", angle = 90))

fdr_pcbp2

Save the plot

ggsave(plot = fdr_pcbp2,
       filename = "fdr_pcbp2.png",
       path = here("figures/"),
       height = 300,
       width = 320,
       units = "mm",
       device = "png",
       dpi = 400)